library(tidyr)
library(ggplot2)
library(MASS)
library(ggdensity)
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load data ----
# Hourly PJM LMP observations plus daily weather/price files for four cities.
data_PJM <- fread('PJM Data.csv')
data_Chicago <- fread('Data Chicago.csv')
data_Milwaukee <- fread('Data Milwaukee.csv')
data_CorpusChristi<- fread('Data Corpus Christi.csv')
data_Austin <- fread('Data Austin.csv')
# Exploring data ----
# Column names of the PJM table (knitted output retained below)
names(data_PJM)
## [1] "datetime_beginning_utc" "datetime_beginning_ept" "pnode_id"
## [4] "pnode_name" "voltage" "equipment"
## [7] "type" "zone" "system_energy_price_rt"
## [10] "total_lmp_rt" "congestion_price_rt" "marginal_loss_price_rt"
## [13] "system_energy_price_da" "total_lmp_da" "congestion_price_da"
## [16] "marginal_loss_price_da" "spread" "date"
## [19] "hour" "am_pm" "day"
## [22] "day_of_year" "month_num" "month"
## [25] "year" "day_of_week" "weekend"
## [28] "isHoliday" "holiday"
# Units of observation: timestamps come in as character, not POSIXct
class(data_PJM$datetime_beginning_utc)
## [1] "character"
# Identify the first date in the sample
min(data_PJM$date)
## [1] "2021-01-01"
# First few unique pricing-node names
head(unique(data_PJM$pnode_name),5)
## [1] "PJM" "BRANDONSH" "BRUNSWICK" "COOKSTOWN" "DOVER"
# Number of distinct pricing nodes
length(unique(data_PJM$pnode_name))
## [1] 339
# Fields of interest to compare: date vs real-time congestion price
data_PJM[, .(date, congestion_price_rt)]
library(dplyr)
# Subset to holiday observations only
# (isHoliday is compared against the string 'TRUE' as in the source data)
holiday_PJM <- data_PJM[isHoliday == 'TRUE']
# Average hourly RT LMP on December AM holiday hours at the Chicago Gen Hub,
# one row per (hour, year), sorted by year. Uses native data.table chaining
# rather than a magrittr pipe.
avg_lmp_chicago_holiday <- holiday_PJM[
  month == 'Dec' & pnode_name == 'CHICAGO GEN HUB' & am_pm == 'AM',
  .(average_lmp_rt = mean(total_lmp_rt)),
  by = .(hour, year)
][order(year)]
# Plot to visualize the holiday averages: one line per year, hour on the x-axis
library(ggplot2)
ggplot(avg_lmp_chicago_holiday, aes(x = hour, y = average_lmp_rt, color = factor(year))) +
  # `size` for lines was deprecated in ggplot2 3.4.0; `linewidth` is the
  # supported aesthetic and silences the knit-time warning
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  labs(title = "Avg Hourly LMP on Holidays (December, AM) - CHICAGO GEN HUB",
       x = "Hour of Day", y = "Average LMP (Real-Time)",
       color = "Year") +
  theme_minimal()
#2022 shows the highest average locational marginal price at the Chicago gen hub node on holidays
#The data is being averaged over total_lmp_rt. This could be based on the winter storm chicago may have #been having in the holiday months. We can cross check the data by running a regression against #temperature, and seeing if there is any predictive power between weather and average_lmp_rt
# Introduction to plotting ----
# Subset the PJM table to the Chicago Gen Hub node only
data_CGH = data_PJM[pnode_name == 'CHICAGO GEN HUB']
# Number of observations reflects the number of rows (hourly, 2021-2023)
nrow(data_CGH)
## [1] 26280
# Order weekdays chronologically rather than alphabetically for plotting
data_CGH$day_of_week <- factor(data_CGH$day_of_week,
levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
# Mean +/- SE of RT LMP by weekday, colored by year
ggplot(data_CGH[order(day_of_week)], aes(x=day_of_week,y=total_lmp_rt, color =
factor(year))) +
stat_summary() +
labs(color = '', x = '', y = 'Total RT LMP ($/MWh)') +
coord_cartesian(ylim = c(0, 80)) +
scale_y_continuous(expand = c(0,0)) +
theme_classic() +
theme(legend.position = 'top')
## No summary function supplied, defaulting to `mean_se()`
#We can see the total lmp_rt per a given day across 2021-2023. We can see that 2022 consistently had the #highest locational marginal price throughout the week
# Same plot with color = factor(year) removed: one point per weekday, pooled over years
data_CGH$day_of_week <- factor(data_CGH$day_of_week, levels =
c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday",
"Sunday"))
ggplot(data_CGH[order(day_of_week)], aes(x=day_of_week,y=total_lmp_rt)) +
stat_summary() +
labs(color = '', x = '', y = 'Total RT LMP ($/MWh)') +
coord_cartesian(ylim = c(0, 80)) +
scale_y_continuous(expand = c(0,0)) +
theme_classic() +
theme(legend.position = 'top')
## No summary function supplied, defaulting to `mean_se()`
#When color = factor(year) is removed it looks like we are seeing the average across all 3 years
# By month ----
# month_name: calendar-ordered factor version of the month abbreviation
data_CGH$month_name <- factor(data_CGH$month, levels =
c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))
# NOTE(review): rows are ordered by the raw `month` string here, but the x-axis
# uses the month_name factor, so the row order does not affect the plot
ggplot(data_CGH[order(month)], aes(x = month_name, y=total_lmp_rt, color =
factor(year))) +
stat_summary() +
labs(color = '', x = '', y = 'Total RT LMP ($/MWh)') +
coord_cartesian(ylim = c(0, 80)) +
scale_y_continuous(expand = c(0,0)) +
theme_classic() +
theme(legend.position = 'top')
## No summary function supplied, defaulting to `mean_se()`
# By hour of day, colored by year
ggplot(data_CGH, aes(x=hour, y=total_lmp_rt,
color = factor(year))) +
stat_summary() +
labs(color = '', x = '', y = 'Total RT LMP ($/MWh)') +
coord_cartesian(ylim = c(0, 80)) +
scale_y_continuous(expand = c(0,0)) +
theme_classic() +
theme(legend.position = 'top')
## No summary function supplied, defaulting to `mean_se()`
# Pre-aggregated monthly averages (one value per month-year)
avg_lmp_chicago <- data_CGH[, .(average_lmp_rt = mean(total_lmp_rt)),
by = .(month, year)]
avg_lmp_chicago$month_name <- factor(avg_lmp_chicago$month, levels =
c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))
ggplot(avg_lmp_chicago[order(month_name)],
aes(x = month_name,
y = average_lmp_rt,
color = factor(year))) +
stat_summary() +
labs(color = '',x = '', y = 'Average RT LMP ($/MWh)') +
coord_cartesian(ylim = c(0,80)) +
theme_classic() +
theme(legend.position = 'top')
## No summary function supplied, defaulting to `mean_se()`
## Warning: Removed 36 rows containing missing values or values outside the scale range
## (`geom_segment()`).
#Chicago Summer '22 saw above higher temperatures #(https://www.weather.gov/lot/Summer_August_2022_Climate_Summaries)
#Also the russia-ukraine war could have been a factor that impacted crude oil prices, and electricity is #generated from oil, among other forms
# Pivoting data: stack total_lmp_rt and total_lmp_da into one long column ----
library(tidyr)
library(ggplot2)
data_CGH_long <- data_CGH[, .(year, month, hour, day_of_week, total_lmp_rt,total_lmp_da)] %>%
pivot_longer(cols = !c(year, month, hour, day_of_week),
names_to = 'lmp_type',
values_to = 'values')
#This goes and makes two columns into rows by creating a new column lmp_type and putting
#total_lmp_rt and total_lmp_da as values
head(data_CGH_long)
setDT(data_CGH_long) # pivot_longer returns a tibble; convert back to data.table
# RT vs DA LMP by weekday, faceted by year
ggplot(data_CGH_long, aes(x=day_of_week,y=values, color = factor(lmp_type))) +
stat_summary() +
labs(color = '', x = '', y = 'LMP ($/MWh)') +
coord_cartesian(ylim = c(0, 100)) +
scale_y_continuous(expand = c(0,0)) +
theme_classic() +
theme(legend.position = 'top',
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
facet_wrap(~year)
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
data_CGH_long$month_name <- factor(data_CGH_long$month, levels =
c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))
# RT vs DA LMP by month, faceted by year
ggplot(data_CGH_long, aes(x=month_name,y=values, color = factor(lmp_type))) +
stat_summary() +
labs(color = '', x = '', y = 'LMP ($/MWh)') +
coord_cartesian(ylim = c(0, 100)) +
scale_y_continuous(expand = c(0,0)) +
theme_classic() +
theme(legend.position = 'top',
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
facet_wrap(~year)
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
# Daily means of RT and DA LMP (one row per calendar day)
CGH_summary_daily <- data_CGH[, .(mean_rt_lmp = mean(total_lmp_rt),
mean_da_lmp = mean(total_lmp_da)),
by = .(day, month,year)]
ggplot(CGH_summary_daily, aes(mean_rt_lmp, color = factor(year))) +
stat_ecdf() +
labs(color = '', x = 'Average RT LMP ($/MWh)',
y = 'Empirical Cumulative Distribution Function') +
theme_classic() +
theme(legend.position = 'top')
# NOTE: CGH_summary_daily is reassigned here — it now holds daily SDs, not means
CGH_summary_daily <- data_CGH[, .(vol_lmp_rt = sd(total_lmp_rt),
vol_lmp_da = sd(total_lmp_da)),
by = .(day, month,year)]
ggplot(CGH_summary_daily, aes(vol_lmp_rt, color = factor(year))) +
stat_ecdf() +
labs(color = '', x = 'Volatility RT LMP ($/MWh)',
y = 'Empirical Cumulative Distribution Function') +
theme_classic() +
theme(legend.position = 'top')
# Daily SD of the RT-DA spread (intraday volatility of the spread)
vol_daily_CGH <- data_CGH[, .(vol_spread = sd(spread, na.rm = TRUE)),
by = .(day, month, year)]
ggplot(vol_daily_CGH[month == 'Aug'], aes(x=vol_spread, color = factor(year))) +
geom_density() +
labs(x = 'Standard Deviation of RT LMP - DA LMP ($/MWh)', y = 'Probability Density
Function', color = '') +
scale_x_continuous(expand = c(0,0)) +
scale_y_continuous(expand = c(0,0)) +
theme_classic() +
theme(legend.position = 'top')
#We are only looking at August for each year. The vol spread is calculated as RT - DA. 2022 has the widest distribution, indicating more volatility
# Comparing to the Eastern Hub ----
# 1. Create the Eastern Hub subset
data_easternHub <- data_PJM[pnode_name == 'EASTERN HUB']
# 2. Calculate the daily vol of the spread
vol_daily_EH <- data_easternHub[, .(vol_spread = sd(spread, na.rm = TRUE)),
by = .(day, month, year)]
# 3. Plot the August distributions, as above
ggplot(vol_daily_EH[month == 'Aug'], aes(x=vol_spread, color = factor(year))) +
geom_density() +
labs(x = 'Standard Deviation of RT LMP - DA LMP ($/MWh)', y = 'Probability Density
Function', color = '') +
scale_x_continuous(expand = c(0,0)) +
scale_y_continuous(expand = c(0,0)) +
theme_classic() +
theme(legend.position = 'top')
#We see tighter spreads, and lower volatility for the Eastern Hub
# Plot the hourly RT LMP for the month of April 2023 for Chicago Gen Hub ----
# Filter April 2023
april_data <- data_CGH[month == 'Apr' & year == 2023]
# geom_col stacks the hourly values across all April days for each hour slot
ggplot(april_data, aes(x = hour, y = total_lmp_rt)) +
geom_col(color = "red") +
labs(title = "Hourly RT LMP for Chicago Gen Hub (April 2023)",
x = "Hour", y = "RT LMP ($/MWh)") +
theme_classic()
#Negative LMP prices indicate that there was a greater supply than demand of energy. Since we had a surplus of energy and no place to store it, the cost of carry pushed prices negative.
# Filter May 2023
library(dplyr)
may_data <- data_CGH[month == 'May' & year == 2023]
ggplot(may_data, aes(x = hour, y = total_lmp_rt)) +
geom_col(color = "blue") +
labs(title = "Hourly RT LMP for Chicago Gen Hub (May 2023)",
x = "Hour", y = "RT LMP ($/MWh)") +
theme_classic()
# Filter June 2023
june_data <- data_CGH[month == 'Jun' & year == 2023]
ggplot(june_data, aes(x = hour, y = total_lmp_rt)) +
  geom_col(color = "green") +
  # Title previously said "April 2023" (copy-paste error) — this is June data
  labs(title = "Hourly RT LMP for Chicago Gen Hub (June 2023)",
       x = "Hour", y = "RT LMP ($/MWh)") +
  theme_classic()
library(dplyr)
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
##
## transpose
# Month-abbreviation -> meteorological season lookup table
mapping <- list(
"Jan" = "Winter",
"Feb" = "Winter",
"Mar" = "Spring",
"Apr" = "Spring",
"May" = "Spring",
"Jun" = "Summer",
"Jul" = "Summer",
"Aug" = "Summer",
"Sep" = "Fall",
"Oct" = "Fall",
"Nov" = "Fall",
"Dec" = "Winter"
)
# Apply the mapping: look up each row's month abbreviation in the list above
data_CGH <- data_CGH %>%
mutate(
season = map_chr(month, ~ mapping[[.x]])
)
# Filter 2023 and split by season for the per-season plots below
library(dplyr)
library(ggplot2)
library(dplyr)
library(ggplot2)
# Filter data for 2023
data_2023_cgh <- data_CGH %>% filter(year == 2023)
# Split data by season (named list: Fall, Spring, Summer, Winter)
season_list <- split(data_2023_cgh, data_2023_cgh$season)
# Bar chart of hourly RT LMP for one season's 2023 data.
# `df`: rows for a single season; `season_name`: label used in the plot title.
# Returns the ggplot object (caller is responsible for printing it).
plot_season_lmp <- function(df, season_name) {
  p <- ggplot(df, aes(x = hour, y = total_lmp_rt))
  p <- p + geom_col(color = "orange")
  p <- p + labs(
    title = paste(season_name, "RT LMP (2023)"),
    x = "Hour",
    y = "RT LMP ($/MWh)"
  )
  p + theme_classic()
}
# Build one plot per season (list element names carry the season labels)
season_plots <- lapply(names(season_list), function(season_name) {
plot_season_lmp(season_list[[season_name]], season_name)
})
# ggplot objects must be printed explicitly inside a loop to render
for (p in season_plots) print(p)
#based on the above graph it looks like the highest is around 8:00 PM and the lowest is around 4:00 AM for april 2023. This makes sense as the energy consumption levels when everyone is asleep will be low vs when everyone is awake and home. Though when looking across all seasons, other than spring we don't see prices go negative.
# Daily boxplots of RT-DA LMP spread volatility, by month, for 2021.
# Fix: in the original, theme(axis.text.x = ...) came BEFORE theme_classic();
# a complete theme resets all theme elements, so the 90-degree axis-label
# rotation was silently discarded. theme() tweaks must come after the theme.
ggplot(vol_daily_CGH[year == 2021], aes(factor(month, levels = month.abb),
                                        vol_spread,
                                        color = factor(month, levels = month.abb))) +
  geom_boxplot() +
  labs(x = '', y = 'Standard Deviation of RT LMP - Lagged DA LMP ($/MWh)') +
  guides(color = 'none') +
  theme_classic() +
  theme(legend.position = 'top',
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# 2022
ggplot(vol_daily_CGH[year == 2022], aes(factor(month, levels = month.abb),
                                        vol_spread,
                                        color = factor(month, levels = month.abb))) +
  geom_boxplot() +
  labs(x = '', y = 'Standard Deviation of RT LMP - Lagged DA LMP ($/MWh)') +
  guides(color = 'none') +
  theme_classic() +
  theme(legend.position = 'top',
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# 2023
ggplot(vol_daily_CGH[year == 2023], aes(factor(month, levels = month.abb),
                                        vol_spread,
                                        color = factor(month, levels = month.abb))) +
  geom_boxplot() +
  labs(x = '', y = 'Standard Deviation of RT LMP - Lagged DA LMP ($/MWh)') +
  guides(color = 'none') +
  theme_classic() +
  theme(legend.position = 'top',
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Assuming 'vol_daily_CGH' has columns: year, month, day, vol_spread
# If 'month' is a character like "Jan", convert it to numeric
vol_daily_CGH_wide <- vol_daily_CGH %>%
mutate(
month_num = match(month, month.abb), # Converts "Jan" to 1, "Feb" to 2, etc.
date = make_date(year, month_num, day)
) %>%
select(date, vol_spread) # Keep only the columns you want
# Row(s) with the single largest spread volatility
vol_daily_CGH_wide %>%
filter(vol_spread == max(vol_spread, na.rm = TRUE))
# The date with the most volatility (original note read "2022-12-2023", which is
# garbled — presumably 2022-12-23; verify against the filter output above)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2)
# Daily temperature series for Chicago, stacked long: average temperature,
# cooling degree days, heating degree days (unique() collapses hourly repeats)
daily_temp_Chicago <- data_Chicago[, .(date, month, year,
avg_temperature_degrees_fahrenheit,
avg_cooling_degree_days,
avg_heating_degree_days)] %>%
unique() %>%
pivot_longer(
cols = starts_with('avg'),
names_to = 'temp_series',
values_to = 'degrees'
)
setDT(daily_temp_Chicago)
# All three series over time; 65F reference line is the degree-day base
ggplot(daily_temp_Chicago, aes(date, degrees, color = factor(temp_series))) +
geom_point(alpha = 0.5) +
scale_color_manual(values = c('red','blue','black')) +
geom_hline(yintercept = 65, color = 'forestgreen') +
labs(
x = '',
color = '',
y = 'Temperature (Degrees Fahrenheit)'
) +
scale_x_date(date_breaks = '3 months', date_labels = "%b %Y") +
theme_classic() +
theme(
legend.position = 'top',
axis.text.x = element_text(angle = 45, vjust = 1, hjust= 1)
)
#There is a cyclical weather pattern. From a glance it looks like there are more heating days than cooling days.
# Density of daily average temperature, one curve per year
ggplot(daily_temp_Chicago[temp_series == 'avg_temperature_degrees_fahrenheit'],
aes(degrees, color = factor(year))) +
geom_density() +
labs(color ='', x = 'Average Temperature (Degrees Fahrenheit)',
y = 'Probability Density Function') +
theme_classic() +
theme(legend.position = 'top')
#We notice the shape of the distribution to be skewed to the left. Out of the three years, 2022 looks like it has the most cold temperature days sub 30 degrees Fahrenheit. I believe the bimodal distribution comes from the increase in probability of temperatures near 40-50 and then 75, based on the seasons.
# Same density, faceted by calendar month to expose the seasonal modes
ggplot(daily_temp_Chicago[temp_series == 'avg_temperature_degrees_fahrenheit'],
aes(degrees, color = factor(year))) +
geom_density() +
labs(color = '',
x = 'Average Temperature (Degrees Fahrenheit)',
y = 'Probability Density Function') +
theme_classic() +
theme(legend.position = 'top') +
facet_wrap(~factor(month, levels = month.abb))
#this lines up with our hypothesis of the bimodal model, since we see peaks in the winter seasons and summer seasons. Two higher averages would output that shape.
# Daily aggregates joining price and weather: mean RT/DA LMP, spread vol,
# mean temperature and humidity, one row per day
lmp_temp_Chicago <- data_Chicago[,.(mean_lmp_rt = mean(total_lmp_rt),
mean_lmp_da = mean(total_lmp_da),
vol_spread = sd(spread),
temp = mean(avg_temperature_degrees_fahrenheit),
humidity = mean(relative_humidity_percent)),
by = .(date, month, year)]
# ECDF of daily average RT LMP by year
ggplot(lmp_temp_Chicago, aes(mean_lmp_rt, color = factor(year))) +
stat_ecdf() +
labs(x = 'Daily Average RT LMP ($/MWh)', y = 'Empirical Cumulative Distribution
Function',
color = '') +
theme_classic() +
theme(legend.position = 'top')
# ECDF of daily average temperature by year
ggplot(lmp_temp_Chicago, aes(temp, color = factor(year))) +
stat_ecdf() +
labs(x = 'Daily Average Temperature (Degrees Fahrenheit)', y = 'Empirical Cumulative
Distribution Function',
color = '') +
theme_classic() +
theme(legend.position = 'top')
#As stated above there is a clear FOSD with 2022 having the highest daily average RT LMP. This is not consistent with the temperature graphs as we see 2023 is a warmer year than 2022. Temperature patterns may not have translated to LMP patterns because there could have been more extreme volatile changes in certain days in 2022 (like in the winter), which drive LMP higher
# Day-over-day change in temperature: next day's value minus today's,
# in degrees Fahrenheit. This is an ABSOLUTE change, not a percent change —
# the original titles/labels said "Percent Change (%)", which was wrong.
data_Chicago[, change_temp := shift(avg_temperature_degrees_fahrenheit, type = "lead") - avg_temperature_degrees_fahrenheit]
ggplot(data_Chicago, aes(x = month, y = change_temp, color = factor(year))) +
  geom_line() +
  labs(title = "Daily Change in Temperature by Year",
       x = "Month",
       y = "Change in Temperature (Degrees Fahrenheit)",
       color = "Year") +
  theme_classic()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
# Same series plotted against the actual date axis
ggplot(data_Chicago, aes(x = date, y = change_temp, color = factor(year))) +
  geom_line() +
  labs(title = "Daily Change in Temperature by Year",
       x = "Date",
       y = "Change in Temperature (Degrees Fahrenheit)",
       color = "Year") +
  theme_classic()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
years <- c(2021, 2022, 2023)
# Annual standard deviation of the daily average temperature, printed one
# year per line (same "Year: <y> SD: <sd>" format as before)
sd_by_year <- vapply(
  years,
  function(yr) data_Chicago[year == yr, sd(avg_temperature_degrees_fahrenheit)],
  numeric(1)
)
for (i in seq_along(years)) {
  print(paste("Year:", years[i], "SD:", sd_by_year[i]))
}
## [1] "Year: 2021 SD: 19.7821463164315"
## [1] "Year: 2022 SD: 20.651650568102"
## [1] "Year: 2023 SD: 17.5274362608774"
#We saw the highest volatility in the average daily change in temp in 2022, which helps explain the increase in LMP.
# 2-D highest-density regions of daily temperature vs logged spread volatility,
# one panel per year (ggdensity::geom_hdr)
ggplot(lmp_temp_Chicago,
aes(x = temp, y= log(vol_spread)) ) +
geom_hdr(xlim = c(-25, 100), ylim = c(0,10)) +
geom_point(shape = 21) +
labs(fill = '', alpha = 'Probability',
x = 'Average Daily Temperature (Degrees Fahrenheit)',
y = 'Logged Standard Deviation of (Lagged DA LMP - RT LMP)') +
theme_classic() +
theme(legend.position = 'top') +
facet_grid(~year)
#25.We see notable outliers in 2022 and 2023. The outliers may be due to drastic changes in temperature DoD and the impact it had on LMP. If it drastically got colder or hotter on certain 2 day periods then the necessity for heating/cooling becomes much more apparent.
library(MASS)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# 2-D kernel density estimate of (temperature, log spread vol), rendered as an
# interactive 3-D surface via plotly
dens_Chicago <- kde2d(lmp_temp_Chicago
[date > '2021-01-01']
$temp, log(lmp_temp_Chicago
[date>'2021-01-01']$vol_spread))
plot_ly(x = dens_Chicago$x, y= dens_Chicago$y, z = dens_Chicago$z) %>%
add_surface() %>%
layout(scene = list(xaxis = list(title = 'Avg Temperature'),
yaxis = list(title = 'Log.SD (DA LMP - RT LMP)'),
zaxis = list(title = 'PDF')))
#26 We can assert X to be the avg temperature, Y to be the log of the standard deviation (DA LMP - RT LMP), and Z to be the PDF. Given that the graph represents the density (volume) where the average temperature is at a certain level for the log of (DA LMP - RT LMP). The peak shows the highest density and the valleys show the lowest density.
# Cross-city DA LMP comparison; columns are aligned positionally — assumes the
# Milwaukee and Chicago tables have the same row order/length (TODO confirm)
lmp_cities <- data.table(
date = data_Milwaukee$date,
Milwaukee = data_Milwaukee$DA_LMP,
Chicago = data_Chicago$total_lmp_da,
da_spread = data_Chicago$total_lmp_da - data_Milwaukee$DA_LMP
)
ggplot(lmp_cities, aes (x=date, y= da_spread)) +
stat_summary(size = 0.2) +
labs(x = '', y = 'Chicago DA LMP - Milwaukee DA LP ($/MWh)' ) +
theme_classic()
## Warning: Removed 48 rows containing non-finite outside the scale range
## (`stat_summary()`).
## No summary function supplied, defaulting to `mean_se()`
# Rebuild the comparison table with weather columns added.
# NOTE(review): columns are aligned positionally across the two city tables —
# assumes identical row order/length; verify or join on date+hour.
lmp_cities <- data.table(
  date = data_Milwaukee$date,
  Milwaukee = data_Milwaukee$DA_LMP,
  Chicago = data_Chicago$total_lmp_da,
  Chicago_Weather = data_Chicago$avg_temperature_degrees_fahrenheit,
  Milwaukee_Weather = data_Milwaukee$avg_temp_f,
  avg_weather_spread = data_Chicago$avg_temperature_degrees_fahrenheit -
    data_Milwaukee$avg_temp_f,
  da_spread = data_Chicago$total_lmp_da - data_Milwaukee$DA_LMP
)
# Weather spread over time. The original put date on the y-axis and reused the
# LMP-spread label; per the note below the intent is the temperature spread
# across time, mirroring the da_spread plot above.
ggplot(lmp_cities, aes(x = date, y = avg_weather_spread)) +
  stat_summary(size = 0.2) +
  labs(x = '', y = 'Chicago Avg Temp - Milwaukee Avg Temp (Degrees Fahrenheit)') +
  theme_classic()
## No summary function supplied, defaulting to `mean_se()`
#27. We can see outliers in the weather spread around the same time as the da_spread plot. To verify results it would be best to run a regression
# Daily average temperature in the two cities, one row per day
# (positional alignment of the two tables assumed — TODO confirm)
temperature_cities <- data.table(
date = data_Milwaukee$date,
Milwaukee = data_Milwaukee$avg_temp_f,
Chicago = data_Chicago$avg_temperature_degrees_fahrenheit) %>%
unique()
# Scatter with the 45-degree line: points above it are days Chicago was warmer
ggplot(temperature_cities, aes(x = Milwaukee, y = Chicago)) +
geom_point(alpha = 0.2) +
geom_abline(intercept = 0, slope = 1,color='red') +
labs(x='Milwaukee Average Daily Temperature (Degrees Fahrenheit)',
y='Chicago Average Daily Temperature (Degrees Fahrenheit)',
color = '') +
facet_wrap(~year(date)) +
theme_classic()
# Daily DA LMP in the two cities, one row per day
lmp_cities_avg <- data.table(
  date = data_Milwaukee$date,
  Milwaukee_mean_lmp_da = data_Milwaukee$DA_LMP,
  Chicago_mean_lmp_da = data_Chicago$total_lmp_da) %>%
  unique()
# Scatter of Chicago vs Milwaukee DA LMP with the 45-degree reference line.
# Fix: the axis labels previously said "Average Daily Temperature (Degrees
# Fahrenheit)" — copy-pasted from the temperature plot — but this plots LMP.
ggplot(lmp_cities_avg, aes(x = Milwaukee_mean_lmp_da, y = Chicago_mean_lmp_da)) +
  geom_point(alpha = 0.2) +
  geom_abline(intercept = 0, slope = 1,color='red') +
  labs(x='Milwaukee Average Daily DA LMP ($/MWh)',
       y='Chicago Average Daily DA LMP ($/MWh)',
       color = '') +
  facet_wrap(~year(date)) +
  theme_classic()
## Warning: Removed 48 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Normal QQ plots of daily temperatures for Chicago vs Milwaukee
ggplot(temperature_cities %>% pivot_longer(!date, names_to = 'series', values_to =
'temp'),
aes(sample = temp, color = series)) +
geom_qq() +
geom_qq_line() +
labs(x='Theoretical Normal Distribution Quantiles',
y='Sample Quantiles (Degrees Fahrenheit)',
color = '') +
theme_classic() +
theme(legend.position = 'top')
#29
# Same QQ comparison for Austin vs Corpus Christi
temperature_cities_Austin_CC <- data.table(
date = data_Austin$date,
Austin = data_Austin$avg_temperature_degrees_fahrenheit,
CC = data_CorpusChristi$avg_temp_f) %>%
unique()
ggplot(temperature_cities_Austin_CC %>% pivot_longer(!date, names_to = 'series', values_to =
'temp'),
aes(sample = temp, color = series)) +
geom_qq() +
geom_qq_line() +
labs(x='Theoretical Normal Distribution Quantiles',
y='Sample Quantiles (Degrees Fahrenheit)',
color = '') +
theme_classic() +
theme(legend.position = 'top')
#30 What do we observe from the plots? Talk about the relationship between price and weather. Is it linear? Describe.
#for the graph of Chicago and Milwaukee the Average temperature ranges from 0 to 45 degrees. As we see temperature decrease we see LMP increase showing a linear relationship from a glance. For Corpus Christi we also see a potential exponential relationship. The range in temperature goes from 20 degrees to 80 degrees, about 25% greater in range than Chicago. When we reach a min of 20 in Corpus Christi we see Real Time LMP shoot up to 2000, possibly due to lack of infrastructure to account for extreme weather. Based on the line graphs they do seem to be drawn from the same distribution
# Hour-over-hour change in DA LMP: lag within each hour slot so each point is
# "this day's hour-h price minus the previous day's hour-h price"
ggplot(data_Chicago[order(date)] %>%
.[,prev_lmp_da := shift(total_lmp_da, type = 'lag'), by = hour],
aes(x = date, y = total_lmp_da - prev_lmp_da)) +
geom_point(shape = 21, size = 0.8, alpha = 0.5) +
labs(x = '',y = 'DA LMP (t+1) - DA LMP (t)') +
theme_classic()
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
#Each point represents the hourly difference in LMP prices in Chicago. Most changes over an hour are not very different, that's why they cluster near 0. Because the weather is not changing extremely for the majority of the time. But some hourly changes have extreme changes in weather, which could be driving the change in LMP up or down. It seems the winter season of each year has the largest DoD changes
#verify above
#I hypothesize the greater change in temperature has greater DoD changes in DA LMP which is seen the most in the volatile winter season of each year.
# Same difference, but lagged within each calendar month instead of hour.
# NOTE(review): prev_lmp_da is recomputed in place with a different `by` each
# time, so each plot overwrites the previous definition.
ggplot(data_Chicago[order(date)] %>%
.[,prev_lmp_da := shift(total_lmp_da, type = 'lag'), by = month],
aes(x = date, y = total_lmp_da - prev_lmp_da)) +
geom_point(shape = 21, size = 0.8, alpha = 0.5) +
labs(x = '',y = 'DA LMP (t+1) - DA LMP (t)') +
theme_classic()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
data_Chicago$month_name <- factor(data_Chicago$month, levels =
c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))
# Lag within day-of-week, plotted against month and faceted by year
ggplot(data_Chicago[order(date)] %>%
.[,prev_lmp_da := shift(total_lmp_da, type = 'lag'), by = day_of_week],
aes(x = month_name, y = total_lmp_da - prev_lmp_da)) +
geom_point(shape = 21, size = 0.8, alpha = 0.5) +
labs(x = '',y = 'DA LMP (t+1) - DA LMP (t)') +
facet_wrap(~year(date)) +
theme_classic()
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Lag within hour again, plotted against day of week
ggplot(data_Chicago[order(date)] %>%
.[,prev_lmp_da := shift(total_lmp_da, type = 'lag'), by = hour],
aes(x = day_of_week, y = total_lmp_da - prev_lmp_da)) +
geom_point(shape = 21, size = 0.8, alpha = 0.5) +
labs(x = '',y = 'DA LMP (t+1) - DA LMP (t)') +
#facet_wrap(~year(date)) +
theme_classic()
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
#33. We see that the change in spread occurs the most in the Winter months. Furthermore there is an increase in volatility during the weekend as opposed to the weekday when people are working
#33
# Build a three-city table of DA LMP, lag each city within its hour slot,
# compute day-over-day spreads, then the Chicago/Milwaukee spread ratio.
# NOTE(review): columns are aligned positionally across the three city tables —
# assumes identical row order/length; verify or join on date+hour.
internodal_spread <- data.table(date = data_Chicago$date,
hour = data_Chicago$hour,
year = data_Chicago$year,
day_of_week = data_Chicago$day_of_week,
month = data_Chicago$month,
da_Chicago = data_Chicago$total_lmp_da,
da_Milwaukee = data_Milwaukee$DA_LMP,
da_CorpusChristi = data_CorpusChristi$DA_LMP) %>%
.[order(date)] %>%
.[,':=' (prev_da_Chicago = shift(da_Chicago, fill = NA, type = 'lag'),
prev_da_Milwaukee = shift(da_Milwaukee, fill = NA, type = 'lag'),
prev_da_CorpusChristi = shift(da_CorpusChristi, fill = NA, type = 'lag')),
by = hour] %>%
.[, ':=' (Chicago_da_spread = da_Chicago - prev_da_Chicago,
Milwaukee_da_spread = da_Milwaukee - prev_da_Milwaukee,
CorpusChristi_da_spread = da_CorpusChristi - prev_da_CorpusChristi)] %>%
.[, spread_ratio := Chicago_da_spread/Milwaukee_da_spread]
# Calendar-ordered month factor for plotting (month.abb == "Jan".."Dec")
internodal_spread$month_name <- factor(internodal_spread$month, levels = month.abb)
# Spread ratio by month, one plot per year.
# Fixes vs original: (1) `internodal_spread[order(date) & year == 2021]` only
# "worked" because order(date) coerces to an all-TRUE logical — plain year
# filtering is what was intended; (2) "Milwauke" typo in titles; (3) the color
# legend was labeled "Year" but the points are colored by month.
ggplot(internodal_spread[year == 2021],
       aes(x = month_name, y = spread_ratio, color = month_name)) +
  geom_point(size = 2) +
  labs(title = "Spread Ratio Chicago vs Milwaukee",
       x = "Month", y = "Spread Ratio",
       color = "Month") +
  theme_minimal()
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(internodal_spread[year == 2022],
       aes(x = month_name, y = spread_ratio, color = month_name)) +
  geom_point(size = 2) +
  labs(title = "Spread Ratio Chicago vs Milwaukee",
       x = "Month", y = "Spread Ratio",
       color = "Month") +
  theme_minimal()
ggplot(internodal_spread[year == 2023],
       aes(x = month_name, y = spread_ratio, color = month_name)) +
  geom_point(size = 2) +
  labs(title = "Spread Ratio Chicago vs Milwaukee",
       x = "Month", y = "Spread Ratio",
       color = "Month") +
  theme_minimal()
## Warning: Removed 72 rows containing missing values or values outside the scale range
## (`geom_point()`).
# All years at once, faceted
ggplot(internodal_spread,
       aes(x = month_name, y = spread_ratio, color = month_name)) +
  geom_point(size = 2) +
  labs(title = "Spread Ratio Chicago vs Milwaukee",
       x = "Month", y = "Spread Ratio",
       color = "Month") +
  facet_wrap(~year(date)) +
  theme_minimal()
## Warning: Removed 96 rows containing missing values or values outside the scale range
## (`geom_point()`).
#33 We can see the spreads have gotten tighter in 2023 compared to 2021 and 2022. Larger values indicate a greater magnitude in difference between the DoD change in LMP prices in Chicago compared to Milwaukee. The tighter the ratio the closer the DoD changes are in both cities. There may be more incentives for transmission and storage in 2023 in order to keep tighter spreads
# Same ratio, but Chicago vs Corpus Christi (ERCOT)
internodal_spread$spread_ratio_ercot <- internodal_spread$Chicago_da_spread /
  internodal_spread$CorpusChristi_da_spread
# Fix: title previously said "Chicago vs Milwauke" (copy-paste), but this plot
# compares Chicago to Corpus Christi; legend relabeled to match the month coloring.
ggplot(internodal_spread[order(date)],
       aes(x = month_name, y = spread_ratio_ercot, color = factor(month_name))) +
  geom_point(size = 2) +
  labs(title = "Spread Ratio Chicago vs Corpus Christi",
       x = "Month", y = "Spread Ratio",
       color = "Month") +
  facet_wrap(~year(date)) +
  theme_minimal()
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
#34 Larger values indicate a higher difference in the DoD change in LMP in Chicago vs Corpus Christi. We want tighter spreads to help [fill in here] and this in turn impacts energy transmission and storage by [fill in here]
# Counts of how often the ratios exceed 1 (Chicago's DoD move larger than the
# other city's) and the sign split of the Chicago/Milwaukee ratio
count1 <- sum(internodal_spread$spread_ratio > 1, na.rm = TRUE)
count2 <- sum(internodal_spread$spread_ratio_ercot > 1, na.rm = TRUE)
print(count1)
## [1] 8860
print(count2)
## [1] 6082
count3 <- sum(internodal_spread$spread_ratio > 0, na.rm = TRUE)
count4 <- sum(internodal_spread$spread_ratio < 0, na.rm = TRUE)
print(count3)
## [1] 16392
print(count4)
## [1] 9792
# Interpretation sketch: ratio > 0 means the two cities' DoD moves had the same
# sign (~63% of hours); ratio > 1 means Chicago's move was larger in magnitude
# and same-signed. (Original note: "I have no idea how to interpret this")
# Ratio of the previous ratio to the current one ("lookahead" despite the name
# uses a lag — NOTE(review): confirm lag vs lead is intended)
internodal_spread[, lagged_spread_ratio := shift(spread_ratio, fill = NA, type = 'lag') ]
internodal_spread[, lookahead_spread_ratio := lagged_spread_ratio/spread_ratio]
# Long format so both series can share one legend
plot_data <- melt(
internodal_spread,
id.vars = "date",
measure.vars = c("spread_ratio", "lookahead_spread_ratio"),
variable.name = "Metric",
value.name = "Value"
)
ggplot(plot_data, aes(x = date, y = Value, color = Metric)) +
# Plot spread_ratio with alpha = 0.5
geom_point(data = plot_data[Metric == "spread_ratio"],
shape = 21, size = 5, alpha = 0.5) +
# Plot lookahead_spread_ratio with alpha = 0.01
geom_point(data = plot_data[Metric == "lookahead_spread_ratio"],
shape = 21, size = 0.3, alpha = 0.01) +
labs(
x = '',
y = 'Spread Ratios',
title = 'Spread Ratio vs Lookahead Spread Ratio Over Time',
color = 'Metric'
) +
theme_classic()
## Warning: Removed 96 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 98 rows containing missing values or values outside the scale range
## (`geom_point()`).
#36 this shows how the ratio changes hour over hour, and the magnitude of the change. In the graph we can see that the lookahead ratio is tighter around 0 compared to the former
daily_PJM <- fread('PJM Daily.csv')
# Load duration curve: rank each day's load within its year as a fraction of
# days (frank of the negated load, so rank ~ "top x% of days")
ldc <- daily_PJM[, .(Date, Actual_load_GWh)] %>%
  .[, Date := as.Date(Date, format = "%m/%d/%Y")] %>%
  .[, rank := frank(-Actual_load_GWh) / .N, by = year(Date)]
ggplot(ldc, aes(rank, Actual_load_GWh, color = factor(year(Date)))) +
  geom_line() +
  # expand_scale() was deprecated in ggplot2 3.3.0; expansion() is the
  # drop-in replacement and silences the knit-time warning
  scale_x_continuous(expand = expansion(mult = c(0, 0.1)), limits = c(0, NA)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(x = 'Percent of Days', y = 'Actual Load (GWh)', color = '',
       title = 'PJM Annual Load Duration Curves') +
  theme_classic() +
  theme(plot.title = element_text(hjust=0.5),
        legend.position = 'top',
        plot.caption = element_text(hjust = 0))
#2018 has the highest demand for electricity
# Same construction for real-time price (a price duration curve; the original
# title said "Load Duration Curves", copy-pasted from the load plot above)
ldc <- daily_PJM[, .(Date, rtm_price)] %>%
  .[, Date := as.Date(Date, format = "%m/%d/%Y")] %>%
  .[, rank := frank(-rtm_price) / .N, by = year(Date)]
ggplot(ldc, aes(rank, rtm_price, color = factor(year(Date)))) +
  geom_line() +
  # expand_scale() deprecated in ggplot2 3.3.0 -> expansion()
  scale_x_continuous(expand = expansion(mult = c(0, 0.1)), limits = c(0, NA)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(x = 'Percent of Days', y = 'RTM Price', color = '',
       title = 'PJM Annual Price Duration Curves') +
  theme_classic() +
  theme(plot.title = element_text(hjust=0.5),
        legend.position = 'top',
        plot.caption = element_text(hjust = 0))
#38. This is consistent with our conclusion: as the load is higher the price is higher
# Daily load (MW) for three cities, aligned positionally by row
# (assumes identical row order/length across tables — TODO confirm)
load_mw_comp <- data.table(date = data_Austin$date,
load_mw_Austin = data_Austin$load_mw,
load_mw_CorpusChristi = data_CorpusChristi$load_mw,
load_mw_Milwaukee = data_Milwaukee$load_mw)
# Convert from wide to long format
load_mw_long <- pivot_longer(
load_mw_comp,
cols = starts_with("load_mw_"),
names_to = "city",
values_to = "load_mw"
)
ggplot(load_mw_long, aes(x = date, y = load_mw, color = city)) +
geom_line() +
labs(title = "Load in Megawatts Over Time",
x = "Date",
y = "Load (MW)",
color = "City") +
theme_minimal()
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Plot all cities on one plot
# Ensure Year and rank exist
library(data.table)
library(ggplot2)
library(lubridate)
setDT(load_mw_long) # Convert tibble/data.frame to data.table
load_mw_long[, Year := year(date)]
# Step 2: Create the rank column (percent of days)
# NOTE(review): the comment said "per city-year", but `by = .(city)` ranks
# across all three years per city; the city_year column built below is never
# used in the plot either — confirm whether by = .(city, Year) was intended.
load_mw_long[, rank := frank(-load_mw) / .N, by = .(city)]
# Step 3: Create a city-year combo column
load_mw_long[, city_year := paste(city, Year)]
# Step 4: Now plot!
ggplot(load_mw_long, aes(x = rank, y = load_mw, color = city)) +
geom_line() +
labs(
x = 'Percent of Days',
y = 'Load (MW)',
color = 'City-Year',
title = 'Annual Load Duration Curves'
) +
theme_minimal()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
#39. Typically colder cities have higher loads
# When you save the notebook, an HTML file containing the code and output will
# be saved alongside it (click the Preview button or press Cmd+Shift+K to
# preview the HTML file).
# The preview shows a rendered HTML copy of the editor's contents. Unlike Knit,
# Preview does not run any R code chunks; it displays each chunk's output from
# when it was last run in the editor.